import pandas as pd
import numpy as np
# Load the ads and feeds logs and keep only the first row seen for each user
# (drop_duplicates keeps the first occurrence by default and returns a new frame).
ads = pd.read_csv("train_data_ads.csv").drop_duplicates(subset='user_id')
feeds = pd.read_csv("train_data_feeds.csv").drop_duplicates(subset='u_userId')
feeds.shape
(180123, 28)
ads.shape
(65297, 35)
# Give feeds the same key name as ads: assign() appends `user_id` as the last
# column, after which the original `u_userId` column is dropped.
feeds = feeds.assign(user_id=feeds['u_userId']).drop(columns='u_userId')
# Outer join on the shared key; columns present in both tables receive the
# default `_x` (ads) / `_y` (feeds) suffixes.
merged = ads.merge(feeds, how='outer', on='user_id')
merged.head()
| log_id | label_x | user_id | age | gender | residence | city | city_rank | series_dev | series_group | ... | e_ch | e_m | e_po | e_pl | e_rn | e_section | e_et | label_y | cillabel | pro | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 373250.0 | 0.0 | 100005 | 3.0 | 2.0 | 16.0 | 147.0 | 2.0 | 32.0 | 6.0 | ... | 19 | 1123 | 13 | 2836 | 1 | 0 | 202206031527 | -1 | -1 | 0 |
| 1 | 101100.0 | 0.0 | 100006 | 5.0 | 2.0 | 13.0 | 191.0 | 4.0 | 32.0 | 6.0 | ... | 19 | 998 | 6 | 835 | 1 | 1 | 202206031125 | -1 | -1 | 0 |
| 2 | 742637.0 | 0.0 | 100009 | 5.0 | 2.0 | 46.0 | 354.0 | 2.0 | 11.0 | 8.0 | ... | 19 | 508 | 2 | 407 | 3 | 0 | 202206080528 | -1 | -1 | 0 |
| 3 | 744753.0 | 0.0 | 100010 | 3.0 | 4.0 | 33.0 | 319.0 | 3.0 | 31.0 | 3.0 | ... | 11 | 1361 | 3 | 2383 | 3 | 0 | 202206032213 | -1 | -1 | 0 |
| 4 | 669191.0 | 0.0 | 100019 | 7.0 | 2.0 | 16.0 | 310.0 | 2.0 | 16.0 | 5.0 | ... | 12 | 1319 | 5 | 2240 | 2 | 0 | 202206032128 | -1 | -1 | 0 |
5 rows × 62 columns
merged.shape
(180123, 62)
76756 + 32278
109034
merged.columns
Index(['log_id', 'label_x', 'user_id', 'age', 'gender', 'residence', 'city',
'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
'hispace_app_tags', 'app_second_class', 'app_score',
'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
'pt_d', 'u_newsCatInterestsST_x', 'u_refreshTimes_x',
'u_feedLifeCycle_x', 'u_phonePrice', 'u_browserLifeCycle',
'u_browserMode', 'u_feedLifeCycle_y', 'u_refreshTimes_y',
'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST_y',
'u_click_ca2_news', 'i_docId', 'i_s_sourceId', 'i_regionEntity',
'i_cat', 'i_entities', 'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch',
'e_m', 'e_po', 'e_pl', 'e_rn', 'e_section', 'e_et', 'label_y',
'cillabel', 'pro'],
dtype='object')
# Potential customers: rows whose feeds label (`label_y`) equals 1.
# Take an explicit copy so the datetime/hour/day columns added to `df_cust`
# later are written to an independent frame instead of a slice of `merged`
# (which is what triggers pandas' SettingWithCopyWarning).
df_cust = merged[merged['label_y'] == 1].copy()
df_cust.shape
(14450, 62)
df_cust
| log_id | label_x | user_id | age | gender | residence | city | city_rank | series_dev | series_group | ... | e_ch | e_m | e_po | e_pl | e_rn | e_section | e_et | label_y | cillabel | pro | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18 | 583728.0 | 0.0 | 100108 | 2.0 | 3.0 | 21.0 | 200.0 | 4.0 | 30.0 | 3.0 | ... | 19 | 506 | 20 | 1100 | 1 | 0 | 202206031522 | 1 | -1 | 100 |
| 21 | 364370.0 | 0.0 | 100127 | 7.0 | 2.0 | 17.0 | 343.0 | 5.0 | 16.0 | 5.0 | ... | 19 | 705 | 6 | 2760 | 1 | 1 | 202206080624 | 1 | -1 | 100 |
| 23 | 588242.0 | 0.0 | 100149 | 8.0 | 2.0 | 16.0 | 425.0 | 2.0 | 34.0 | 7.0 | ... | 19 | 928 | 11 | 1535 | 1 | 0 | 202206080717 | 1 | -1 | 100 |
| 27 | 679513.0 | 0.0 | 100158 | 6.0 | 4.0 | 33.0 | 319.0 | 3.0 | 27.0 | 2.0 | ... | 19 | 1193 | 9 | 332 | 1 | 0 | 202206030115 | 1 | -1 | 100 |
| 29 | 1084910.0 | 0.0 | 100166 | 5.0 | 2.0 | 30.0 | 113.0 | 5.0 | 16.0 | 5.0 | ... | 19 | 1424 | 6 | 879 | 1 | 1 | 202206031126 | 1 | -1 | 100 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 179951 | NaN | NaN | 131907 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 19 | 1444 | 13 | 1170 | 1 | 0 | 202206051322 | 1 | -1 | 40 |
| 179959 | NaN | NaN | 123724 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 19 | 1089 | 12 | 2658 | 1 | 0 | 202206051923 | 1 | -1 | 80 |
| 179996 | NaN | NaN | 215157 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 19 | 1319 | 13 | 2420 | 1 | 0 | 202206050917 | 1 | -1 | 60 |
| 179997 | NaN | NaN | 107610 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 19 | 1075 | 20 | 505 | 1 | 0 | 202206050816 | 1 | -1 | 0 |
| 180013 | NaN | NaN | 246671 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 19 | 620 | 6 | 2042 | 1 | 1 | 202206050116 | 1 | -1 | 0 |
14450 rows × 62 columns
import matplotlib.pyplot as plt
# Histogram of the `age` values among potential customers (default binning).
plt.hist(x=df_cust['age'])
(array([ 874., 1744., 184., 0., 1673., 922., 0., 1589., 1388.,
204.]),
array([2. , 2.7, 3.4, 4.1, 4.8, 5.5, 6.2, 6.9, 7.6, 8.3, 9. ]),
<BarContainer object of 10 artists>)
# Histogram of the `residence` values among potential customers (default binning).
plt.hist(x=df_cust['residence'])
(array([ 651., 1013., 2093., 353., 983., 626., 1544., 77., 562.,
676.]),
array([11. , 14.5, 18. , 21.5, 25. , 28.5, 32. , 35.5, 39. , 42.5, 46. ]),
<BarContainer object of 10 artists>)
# Histogram of the `city` values among potential customers (default binning).
plt.hist(x=df_cust['city'])
(array([ 843., 888., 717., 733., 621., 651., 1750., 1079., 477.,
819.]),
array([101., 135., 169., 203., 237., 271., 305., 339., 373., 407., 441.]),
<BarContainer object of 10 artists>)
# Histogram of the `series_group` values among potential customers (default binning).
plt.hist(x=df_cust['series_group'])
(array([1065., 1926., 0., 643., 0., 1705., 1311., 0., 1153.,
775.]),
array([2. , 2.6, 3.2, 3.8, 4.4, 5. , 5.6, 6.2, 6.8, 7.4, 8. ]),
<BarContainer object of 10 artists>)
# Histogram of the `e_section` values among potential customers (default binning).
plt.hist(x=df_cust['e_section'])
(array([8685., 0., 0., 0., 0., 0., 0., 0., 0.,
5765.]),
array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
<BarContainer object of 10 artists>)
import plotly.express as px

# A pie sector must be sized by how many rows fall in each category.  Passing
# the same column as both `values` and `names` makes plotly sum the category
# codes per sector instead, which inflates sectors with larger codes.
# value_counts() gives the row count per category (missing values excluded).
age_counts = df_cust['age'].value_counts()
fig = px.pie(values=age_counts.values, names=age_counts.index,
             title="Potential Customer Age Distribution")
fig.show()
fig = px.box(df_cust, x="age", title="Potential Customer Age Distribution (Boxplot)")
fig.show()

residence_counts = df_cust['residence'].value_counts()
fig = px.pie(values=residence_counts.values, names=residence_counts.index,
             title="Potential Customer Residence Distribution")
fig.show()

city_counts = df_cust['city'].value_counts()
fig = px.pie(values=city_counts.values, names=city_counts.index)
fig.show()
merged['e_section'].value_counts()
1 109898 0 70225 Name: e_section, dtype: int64
df_cust['e_section'].value_counts()
0 8685 1 5765 Name: e_section, dtype: int64
# Row counts per `e_section` value (0 and 1) among potential customers; a value
# that never occurs falls back to a count of 0 instead of raising.
value_counts = df_cust['e_section'].value_counts()
labels = ['0', '1']
count_0, count_1 = (value_counts.get(section, 0) for section in (0, 1))
values = [count_0, count_1]
fig = px.pie(
    names=labels,
    values=values,
    title='Distribution of content preferences among Potential Customers',
)
fig.show()
df_cust.columns
Index(['log_id', 'label_x', 'user_id', 'age', 'gender', 'residence', 'city',
'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
'hispace_app_tags', 'app_second_class', 'app_score',
'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
'pt_d', 'u_newsCatInterestsST_x', 'u_refreshTimes_x',
'u_feedLifeCycle_x', 'u_phonePrice', 'u_browserLifeCycle',
'u_browserMode', 'u_feedLifeCycle_y', 'u_refreshTimes_y',
'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST_y',
'u_click_ca2_news', 'i_docId', 'i_s_sourceId', 'i_regionEntity',
'i_cat', 'i_entities', 'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch',
'e_m', 'e_po', 'e_pl', 'e_rn', 'e_section', 'e_et', 'label_y',
'cillabel', 'pro'],
dtype='object')
import plotly.express as px

# One pie per device attribute of the potential customers.  Sectors are sized
# by row count per category (value_counts); using the column itself as
# `values` would make plotly sum the category codes instead.
device_pie_titles = {
    'series_dev': "Potential Customer Device Series Distribution",
    'series_group': "Potential Customer Device Series Group Distribution",
    'emui_dev': "Potential Customer Device EMUI Distribution",
    'device_name': "Potential Customer Device Name Distribution",
    'device_size': "Potential Customer Device Size Distribution",
}
for column, title in device_pie_titles.items():
    counts = df_cust[column].value_counts()
    fig = px.pie(values=counts.values, names=counts.index, title=title)
    fig.show()
df_cust.columns
Index(['log_id', 'label_x', 'user_id', 'age', 'gender', 'residence', 'city',
'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
'hispace_app_tags', 'app_second_class', 'app_score',
'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
'pt_d', 'u_newsCatInterestsST_x', 'u_refreshTimes_x',
'u_feedLifeCycle_x', 'u_phonePrice', 'u_browserLifeCycle',
'u_browserMode', 'u_feedLifeCycle_y', 'u_refreshTimes_y',
'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST_y',
'u_click_ca2_news', 'i_docId', 'i_s_sourceId', 'i_regionEntity',
'i_cat', 'i_entities', 'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch',
'e_m', 'e_po', 'e_pl', 'e_rn', 'e_section', 'e_et', 'label_y',
'cillabel', 'pro'],
dtype='object')
# Parse the ads (`pt_d`) and feeds (`e_et`) timestamps, given as YYYYMMDDHHMM,
# into datetime columns, overwriting the raw values.
for ts_col in ('pt_d', 'e_et'):
    df_cust[ts_col] = pd.to_datetime(df_cust[ts_col], format='%Y%m%d%H%M')
C:\Users\anime\AppData\Local\Temp\ipykernel_10964\194800331.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\anime\AppData\Local\Temp\ipykernel_10964\194800331.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Hour of day of the ads and feeds timestamps.
for src_col, hour_col in (('pt_d', 'ads_hour'), ('e_et', 'feeds_hour')):
    df_cust[hour_col] = df_cust[src_col].dt.hour
C:\Users\anime\AppData\Local\Temp\ipykernel_10964\2629954945.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\anime\AppData\Local\Temp\ipykernel_10964\2629954945.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Numeric day of week (pandas convention: Monday=0 ... Sunday=6).
for src_col, day_col in (('pt_d', 'ads_day'), ('e_et', 'feeds_day')):
    df_cust[day_col] = df_cust[src_col].dt.dayofweek
C:\Users\anime\AppData\Local\Temp\ipykernel_10964\1720565557.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\anime\AppData\Local\Temp\ipykernel_10964\1720565557.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Weekday name of the ads and feeds timestamps, used as chart labels.
for src_col, name_col in (('pt_d', 'ads_dayname'), ('e_et', 'feeds_dayname')):
    df_cust[name_col] = df_cust[src_col].dt.day_name()
C:\Users\anime\AppData\Local\Temp\ipykernel_10964\1132003763.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\anime\AppData\Local\Temp\ipykernel_10964\1132003763.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_cust.columns
Index(['log_id', 'label_x', 'user_id', 'age', 'gender', 'residence', 'city',
'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
'hispace_app_tags', 'app_second_class', 'app_score',
'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
'pt_d', 'u_newsCatInterestsST_x', 'u_refreshTimes_x',
'u_feedLifeCycle_x', 'u_phonePrice', 'u_browserLifeCycle',
'u_browserMode', 'u_feedLifeCycle_y', 'u_refreshTimes_y',
'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST_y',
'u_click_ca2_news', 'i_docId', 'i_s_sourceId', 'i_regionEntity',
'i_cat', 'i_entities', 'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch',
'e_m', 'e_po', 'e_pl', 'e_rn', 'e_section', 'e_et', 'label_y',
'cillabel', 'pro', 'ads_hour', 'feeds_hour', 'ads_day', 'feeds_day',
'ads_dayname', 'feeds_dayname'],
dtype='object')
import plotly.express as px
# Size sectors by the number of rows per hour.  Using `ads_hour` itself as
# `values` would sum the hour numbers, so hour 0 would get a zero-sized sector
# and late hours would be over-weighted.
ads_hour_counts = df_cust['ads_hour'].value_counts()
fig = px.pie(values=ads_hour_counts.values, names=ads_hour_counts.index,
             title="Potential Customer Advertisement Hour Viewed Distribution")
fig.show()
import plotly.express as px
# Row counts per feeds hour, computed once and reused for sizes and labels.
feeds_hour_counts = df_cust['feeds_hour'].value_counts()
fig = px.pie(
    df_cust,
    names=feeds_hour_counts.index,
    values=feeds_hour_counts.values,
    title="Potential Customer Feeds Hour Viewed Distribution",
)
fig.show()
import plotly.express as px
# Take both the sector sizes and the labels from one value_counts() of the
# weekday name.  Previously sizes came from `ads_day` and labels from
# `ads_dayname`, which only lined up if two separate sorts produced the same
# order (not guaranteed when counts tie).
ads_day_counts = df_cust['ads_dayname'].value_counts()
fig = px.pie(df_cust, values=ads_day_counts.values, names=ads_day_counts.index,
             title="Potential Customer Advertisement Day Viewed Distribution")
fig.show()
# Row counts per `e_section` value among potential customers; .get(..., 0)
# yields 0 for a value that does not occur.
value_counts = df_cust['e_section'].value_counts()
count_0, count_1 = value_counts.get(0, 0), value_counts.get(1, 0)
labels = ['0', '1']
values = [count_0, count_1]
fig = px.pie(
    names=labels,
    values=values,
    title='Distribution of Content Preferences among Customers',
)
fig.show()
df_cust['feeds_day']
18 4
21 2
23 2
27 4
29 4
..
179951 6
179959 6
179996 6
179997 6
180013 6
Name: feeds_day, Length: 14450, dtype: int64
import plotly.express as px
# Use a single value_counts() of the weekday name for both sizes and labels so
# each label is guaranteed to sit on its own count (previously sizes came from
# `feeds_day` and labels from `feeds_dayname`, matched only by sort order).
feeds_day_counts = df_cust['feeds_dayname'].value_counts()
fig = px.pie(df_cust, values=feeds_day_counts.values, names=feeds_day_counts.index,
             title="Potential Customer Feeds Day Viewed Distribution")
fig.show()
# Non-potential customers: rows whose feeds label (`label_y`) equals -1.
# Copy explicitly so the columns added to `df_noncust` later go to an
# independent frame rather than a slice of `merged` (avoids
# SettingWithCopyWarning).
df_noncust = merged[merged['label_y'] == -1.0].copy()
# Persist the full merged table (row index included, pandas' default).
merged.to_csv("merged_dataframe.csv")
import plotly.express as px

# Size each sector by row count per category.  Passing the column as both
# `values` and `names` would make plotly sum the category codes instead.
age_counts = df_noncust['age'].value_counts()
fig = px.pie(values=age_counts.values, names=age_counts.index,
             title="Non-Potential Customer Age Distribution")
fig.show()

residence_counts = df_noncust['residence'].value_counts()
fig = px.pie(values=residence_counts.values, names=residence_counts.index,
             title="Non-Potential Customer Residence Distribution")
fig.show()
# Row counts per `e_section` value (0 and 1) among non-potential customers,
# defaulting to 0 when a value is absent.
value_counts = df_noncust['e_section'].value_counts()
labels = ['0', '1']
count_0, count_1 = (value_counts.get(section, 0) for section in (0, 1))
values = [count_0, count_1]
fig = px.pie(
    names=labels,
    values=values,
    title='Distribution of content preferences among Non-Potential Customers',
)
fig.show()
import plotly.express as px

# One pie per device attribute of the non-potential customers, sized by row
# count per category (using the column itself as `values` would sum the
# category codes instead of counting rows).
device_pie_titles = {
    'series_group': "Non-Potential Customer Device Series Group Distribution",
    'series_dev': "Non-Potential Customer Device Series Distribution",
    'emui_dev': "Non-Potential Customer Device EMUI Distribution",
}
for column, title in device_pie_titles.items():
    counts = df_noncust[column].value_counts()
    fig = px.pie(values=counts.values, names=counts.index, title=title)
    fig.show()
# Parse the ads (`pt_d`) and feeds (`e_et`) timestamps, given as YYYYMMDDHHMM,
# into datetime columns, overwriting the raw values.
for ts_col in ('pt_d', 'e_et'):
    df_noncust[ts_col] = pd.to_datetime(df_noncust[ts_col], format='%Y%m%d%H%M')
C:\Users\anime\AppData\Local\Temp\ipykernel_10964\372196138.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\anime\AppData\Local\Temp\ipykernel_10964\372196138.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Hour of day of the ads and feeds timestamps.
for src_col, hour_col in (('pt_d', 'ads_hour'), ('e_et', 'feeds_hour')):
    df_noncust[hour_col] = df_noncust[src_col].dt.hour
C:\Users\anime\AppData\Local\Temp\ipykernel_10964\137544654.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\anime\AppData\Local\Temp\ipykernel_10964\137544654.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Numeric day of week (pandas convention: Monday=0 ... Sunday=6).
for src_col, day_col in (('pt_d', 'ads_day'), ('e_et', 'feeds_day')):
    df_noncust[day_col] = df_noncust[src_col].dt.dayofweek
C:\Users\anime\AppData\Local\Temp\ipykernel_10964\1650693043.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\anime\AppData\Local\Temp\ipykernel_10964\1650693043.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Weekday name of the ads and feeds timestamps, used as chart labels.
for src_col, name_col in (('pt_d', 'ads_dayname'), ('e_et', 'feeds_dayname')):
    df_noncust[name_col] = df_noncust[src_col].dt.day_name()
C:\Users\anime\AppData\Local\Temp\ipykernel_10964\4293718302.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\anime\AppData\Local\Temp\ipykernel_10964\4293718302.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
import plotly.express as px
# Size sectors by the number of rows per hour.  Using `ads_hour` itself as
# `values` would sum the hour numbers, so hour 0 would get a zero-sized sector
# and late hours would be over-weighted.
ads_hour_counts = df_noncust['ads_hour'].value_counts()
fig = px.pie(values=ads_hour_counts.values, names=ads_hour_counts.index,
             title="Non-Potential Customer Advertisement Hour Viewed Distribution")
fig.show()
import plotly.express as px
# Row counts per feeds hour, computed once and reused for sizes and labels.
feeds_hour_counts = df_noncust['feeds_hour'].value_counts()
fig = px.pie(
    df_noncust,
    names=feeds_hour_counts.index,
    values=feeds_hour_counts.values,
    title="Non-Potential Customer Feeds Hour Viewed Distribution",
)
fig.show()
import plotly.express as px

# For each weekday pie, take sizes and labels from one value_counts() of the
# weekday-name column.  Previously sizes came from the numeric day column and
# labels from the name column, matched only by the order of two separate sorts.
day_pie_titles = {
    'ads_dayname': "Non-Potential Customer Advertisement Day Viewed Distribution",
    'feeds_dayname': "Non-Potential Customer Feeds Day Viewed Distribution",
}
for column, title in day_pie_titles.items():
    day_counts = df_noncust[column].value_counts()
    fig = px.pie(df_noncust, values=day_counts.values, names=day_counts.index, title=title)
    fig.show()
# Write the potential-customer frame (with the hour/day columns added above)
# to CSV; the row index is written as well (to_csv default).
df_cust.to_csv("customer_df.csv")
df_cust.columns
Index(['log_id', 'label_x', 'user_id', 'age', 'gender', 'residence', 'city',
'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
'hispace_app_tags', 'app_second_class', 'app_score',
'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
'pt_d', 'u_newsCatInterestsST_x', 'u_refreshTimes_x',
'u_feedLifeCycle_x', 'u_phonePrice', 'u_browserLifeCycle',
'u_browserMode', 'u_feedLifeCycle_y', 'u_refreshTimes_y',
'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST_y',
'u_click_ca2_news', 'i_docId', 'i_s_sourceId', 'i_regionEntity',
'i_cat', 'i_entities', 'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch',
'e_m', 'e_po', 'e_pl', 'e_rn', 'e_section', 'e_et', 'label_y',
'cillabel', 'pro', 'ads_hour', 'feeds_hour', 'ads_day', 'feeds_day',
'ads_dayname', 'feeds_dayname'],
dtype='object')
# Write the non-potential-customer frame (with the hour/day columns added
# above) to CSV; the row index is written as well (to_csv default).
df_noncust.to_csv("noncustomer_df.csv")
import plotly.graph_objects as go
# Per-age row counts for each segment, computed once per segment.
cust_age_counts = df_cust['age'].value_counts()
noncust_age_counts = df_noncust['age'].value_counts()
age_traces = [
    go.Bar(name='Potential Customers', x=cust_age_counts.index, y=cust_age_counts.values),
    go.Bar(name='Non Customers', x=noncust_age_counts.index, y=noncust_age_counts.values),
]
fig = go.Figure(data=age_traces)
# Place the two segments side by side for each age value
fig.update_layout(title="Grouped Barchart to Visualize Age Distribution Differences", barmode='group')
fig.show()
# Overall split of the feeds label (`label_y`) across all merged rows.
label_counts = merged['label_y'].value_counts()
fig = px.pie(
    merged,
    names=label_counts.index,
    values=label_counts.values,
    title="Label Distribution",
)
fig.show()
merged.shape
(180123, 62)
merged['label_y'].value_counts()
-1 165673 1 14450 Name: label_y, dtype: int64
merged.columns
Index(['log_id', 'label_x', 'user_id', 'age', 'gender', 'residence', 'city',
'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
'hispace_app_tags', 'app_second_class', 'app_score',
'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
'pt_d', 'u_newsCatInterestsST_x', 'u_refreshTimes_x',
'u_feedLifeCycle_x', 'u_phonePrice', 'u_browserLifeCycle',
'u_browserMode', 'u_feedLifeCycle_y', 'u_refreshTimes_y',
'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST_y',
'u_click_ca2_news', 'i_docId', 'i_s_sourceId', 'i_regionEntity',
'i_cat', 'i_entities', 'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch',
'e_m', 'e_po', 'e_pl', 'e_rn', 'e_section', 'e_et', 'label_y',
'cillabel', 'pro'],
dtype='object')
ads.columns
Index(['log_id', 'label', 'user_id', 'age', 'gender', 'residence', 'city',
'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
'hispace_app_tags', 'app_second_class', 'app_score',
'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
'pt_d', 'u_newsCatInterestsST', 'u_refreshTimes', 'u_feedLifeCycle'],
dtype='object')
feeds.columns
Index(['u_phonePrice', 'u_browserLifeCycle', 'u_browserMode',
'u_feedLifeCycle', 'u_refreshTimes', 'u_newsCatInterests',
'u_newsCatDislike', 'u_newsCatInterestsST', 'u_click_ca2_news',
'i_docId', 'i_s_sourceId', 'i_regionEntity', 'i_cat', 'i_entities',
'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch', 'e_m', 'e_po', 'e_pl',
'e_rn', 'e_section', 'e_et', 'label', 'cillabel', 'pro', 'user_id'],
dtype='object')
feeds.shape
(180123, 28)
feeds['cillabel'].value_counts()
-1 180046 1 77 Name: cillabel, dtype: int64
import plotly.express as px

# Size sectors by row count per gender code.  Passing `gender` as both
# `values` and `names` would make plotly sum the codes per sector, skewing
# the shares towards the larger codes.
cust_gender_counts = df_cust['gender'].value_counts()
fig = px.pie(values=cust_gender_counts.values, names=cust_gender_counts.index,
             title="Potential Customer Gender Distribution")
fig.show()

noncust_gender_counts = df_noncust['gender'].value_counts()
fig = px.pie(values=noncust_gender_counts.values, names=noncust_gender_counts.index,
             title="Non Customer Gender Distribution")
fig.show()
df_cust['gender'].unique()
array([ 3., 2., 4., nan])
import plotly.graph_objects as go
# Per-gender row counts for each segment, computed once per segment.
cust_gender_counts = df_cust['gender'].value_counts()
noncust_gender_counts = df_noncust['gender'].value_counts()
fig = go.Figure(data=[
    go.Bar(name='Potential Customers', x=cust_gender_counts.index, y=cust_gender_counts.values),
    go.Bar(name='Non Customers', x=noncust_gender_counts.index, y=noncust_gender_counts.values),
])
# Change the bar mode.  The title previously said "Age" (copied from the age
# chart) although this figure plots gender.
fig.update_layout(barmode='group', title="Grouped Barchart to Visualize Gender Distribution Differences")
fig.show()
import plotly.graph_objects as go

# Grouped bar chart per column: potential customers vs non customers, with
# bars sized by the row count of each category.
bar_specs = (
    ('residence', "Grouped Barchart to Visualize Residence Distribution Differences"),
    ('e_section', "Grouped Barchart to Visualize Content Preference Distribution Differences"),
)
for column, chart_title in bar_specs:
    cust_counts = df_cust[column].value_counts()
    noncust_counts = df_noncust[column].value_counts()
    fig = go.Figure(data=[
        go.Bar(name='Potential Customers', x=cust_counts.index, y=cust_counts.values),
        go.Bar(name='Non Customers', x=noncust_counts.index, y=noncust_counts.values),
    ])
    # Show the two segments side by side
    fig.update_layout(barmode='group', title=chart_title)
    fig.show()
df_cust['e_section'].value_counts()
0 8685 1 5765 Name: e_section, dtype: int64
df_noncust['e_section'].value_counts()
1 104133 0 61540 Name: e_section, dtype: int64
df_cust.columns
Index(['log_id', 'label_x', 'user_id', 'age', 'gender', 'residence', 'city',
'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
'hispace_app_tags', 'app_second_class', 'app_score',
'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
'pt_d', 'u_newsCatInterestsST_x', 'u_refreshTimes_x',
'u_feedLifeCycle_x', 'u_phonePrice', 'u_browserLifeCycle',
'u_browserMode', 'u_feedLifeCycle_y', 'u_refreshTimes_y',
'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST_y',
'u_click_ca2_news', 'i_docId', 'i_s_sourceId', 'i_regionEntity',
'i_cat', 'i_entities', 'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch',
'e_m', 'e_po', 'e_pl', 'e_rn', 'e_section', 'e_et', 'label_y',
'cillabel', 'pro', 'ads_hour', 'feeds_hour', 'ads_day', 'feeds_day',
'ads_dayname', 'feeds_dayname'],
dtype='object')
import plotly.graph_objects as go

# Grouped bar chart per column (device attributes, then viewing hour/day):
# potential customers vs non customers, bars sized by row count per category.
bar_specs = (
    ('series_dev', "Grouped Barchart to Visualize Device Series Distribution Differences"),
    ('series_group', "Grouped Barchart to Visualize Device Series Group Distribution Differences"),
    ('emui_dev', "Grouped Barchart to Visualize Device EMUI Distribution Differences"),
    ('ads_hour', "Grouped Barchart to Visualize Ad Hour Viewed Distribution Differences"),
    ('feeds_hour', "Grouped Barchart to Visualize Feed Hour Viewed Distribution Differences"),
    ('feeds_day', "Grouped Barchart to Visualize Feed Day Viewed Distribution Differences"),
    ('ads_day', "Grouped Barchart to Visualize Ad Day Viewed Distribution Differences"),
)
for column, chart_title in bar_specs:
    cust_counts = df_cust[column].value_counts()
    noncust_counts = df_noncust[column].value_counts()
    fig = go.Figure(data=[
        go.Bar(name='Potential Customers', x=cust_counts.index, y=cust_counts.values),
        go.Bar(name='Non Customers', x=noncust_counts.index, y=noncust_counts.values),
    ])
    # Show the two segments side by side
    fig.update_layout(barmode='group', title=chart_title)
    fig.show()
df_cust.columns
Index(['log_id', 'label_x', 'user_id', 'age', 'gender', 'residence', 'city',
'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
'hispace_app_tags', 'app_second_class', 'app_score',
'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
'pt_d', 'u_newsCatInterestsST_x', 'u_refreshTimes_x',
'u_feedLifeCycle_x', 'u_phonePrice', 'u_browserLifeCycle',
'u_browserMode', 'u_feedLifeCycle_y', 'u_refreshTimes_y',
'u_newsCatInterests', 'u_newsCatDislike', 'u_newsCatInterestsST_y',
'u_click_ca2_news', 'i_docId', 'i_s_sourceId', 'i_regionEntity',
'i_cat', 'i_entities', 'i_dislikeTimes', 'i_upTimes', 'i_dtype', 'e_ch',
'e_m', 'e_po', 'e_pl', 'e_rn', 'e_section', 'e_et', 'label_y',
'cillabel', 'pro', 'ads_hour', 'feeds_hour', 'ads_day', 'feeds_day',
'ads_dayname', 'feeds_dayname'],
dtype='object')
import plotly.express as px

# One pie per (segment, article column) pair, sized by row count per value.
pie_specs = (
    (df_cust, 'i_upTimes', "Potential Customer Article Like Distribution"),
    (df_noncust, 'i_upTimes', "Non-Potential Customer Article Like Distribution"),
    (df_cust, 'i_dislikeTimes', "Potential Customer Article Dislike Distribution"),
    (df_noncust, 'i_dislikeTimes', "Non-Potential Customer Article Dislike Distribution"),
    (df_cust, 'pro', "Potential Customer Article Progress Distribution"),
)
for frame, column, chart_title in pie_specs:
    counts = frame[column].value_counts()
    fig = px.pie(frame, values=counts.values, names=counts.index, title=chart_title)
    fig.show()
df_noncust['pro'].value_counts()
0 165673 Name: pro, dtype: int64
import plotly.express as px
# Row counts per `pro` value among non-potential customers.
pro_counts = df_noncust['pro'].value_counts()
fig = px.pie(
    df_noncust,
    names=pro_counts.index,
    values=pro_counts.values,
    title="Non-Potential Customer Article Progress Distribution",
)
fig.show()
import plotly.graph_objects as go

# Grouped bar chart per article column: potential customers vs non customers,
# bars sized by the row count of each value.
bar_specs = (
    ('i_upTimes', "Grouped Barchart to Visualize Article Liked Distribution Differences"),
    ('i_dislikeTimes', "Grouped Barchart to Visualize Article Disliked Distribution Differences"),
    ('pro', "Grouped Barchart to Visualize Article Progress Distribution Differences"),
)
for column, chart_title in bar_specs:
    cust_counts = df_cust[column].value_counts()
    noncust_counts = df_noncust[column].value_counts()
    fig = go.Figure(data=[
        go.Bar(name='Potential Customers', x=cust_counts.index, y=cust_counts.values),
        go.Bar(name='Non Customers', x=noncust_counts.index, y=noncust_counts.values),
    ])
    # Show the two segments side by side
    fig.update_layout(barmode='group', title=chart_title)
    fig.show()